# Load the raw dataset
import os

import numpy as np
import pandas as pd

# Build the CSV path from this script's own directory, then load it into a
# DataFrame.
curPath = os.path.abspath(os.path.dirname(__file__))
_trainCsvPath = os.path.join(curPath, "../titanic/data/train.csv")
trainData = pd.read_csv(_trainCsvPath)
# Extract the labels as a float32 column vector of shape (n_rows, 1).
# This has to run before the "Survived" column is dropped from trainData.
# A missing column raises KeyError here rather than being silently ignored,
# which previously left y_data undefined.
y_data = trainData["Survived"].values.astype(np.float32).reshape(-1, 1)

# Remove the listed columns from trainData in place (these are columns, not
# rows), then record how many rows the frame holds.
_droppedColumns = ["PassengerId", "Survived", "Name", "Ticket", "Cabin", "Embarked"]
trainData.drop(columns=_droppedColumns, inplace=True)
theLength = len(trainData)
# Encode Sex numerically: "male" -> 0, every other value (including NaN) -> 1.
# A single vectorized comparison replaces the per-row .loc loop and yields an
# integer column instead of an object column holding Python ints.
trainData["Sex"] = (trainData["Sex"] != "male").astype(int)
# Fill missing Age values with the most frequent age (the first mode).
# NOTE(review): the original comments described this as the median and the
# variable is named avg, but the statistic computed is the mode -- confirm
# which one was intended before changing it.
_ageColumn = trainData["Age"]
avg = _ageColumn.mode()[0]
trainData["Age"] = _ageColumn.fillna(avg)
